Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied. You are provided with historical sales data for 1,115 Rossmann stores.
# Importing libraries
import numpy as np
import pandas as pd
from pandas import datetime
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import matplotlib.gridspec as gridspec
import plotly.express as px
from IPython.display import display
%matplotlib inline
from sklearn.metrics import mean_squared_error
from statsmodels.distributions.empirical_distribution import ECDF
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARMA,ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from fbprophet import Prophet
import warnings
warnings.filterwarnings("ignore")
# Importing data
train = pd.read_csv("train.csv")
store = pd.read_csv("store.csv")
train.head()
train.info()
# Change Date datatype to datetime type
train['Date'] = pd.to_datetime(train['Date'])
# Date extraction into separate calendar features
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Day'] = train['Date'].dt.day
# FIX: Series.dt.weekofyear was deprecated and removed in pandas 2.0;
# isocalendar().week is the replacement (returns UInt32, cast to int).
train['WeekOfYear'] = train['Date'].dt.isocalendar().week.astype(int)
# Adding new variable: average spend per customer per store-day.
# NOTE(review): rows with 0 customers yield NaN (0/0) here — those closed-store
# rows are dropped later in the cleaning step.
train['SalePerCustomer'] = train['Sales']/train['Customers']
train['SalePerCustomer'].describe()
On average, customers spend about $9.50 per store visit.
# ECDF: empirical cumulative distribution of the three key quantities,
# stacked in one figure (only the top panel gets the shared y-label).
sns.set(style="ticks")
plt.figure(figsize=(12, 6))
panels = [
    ('Sales', 'Sales'),
    ('Customers', 'Customers'),
    ('SalePerCustomer', 'Sale per Customer'),
]
for position, (column, title) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    cdf = ECDF(train[column])
    plt.plot(cdf.x, cdf.y, label="statmodels")
    plt.title(title)
    if position == 1:
        plt.ylabel('ECDF')
plt.subplots_adjust(hspace=0.8)
About 20% of the rows have zero sales/customers, most likely because those stores were closed for state or school holidays.
# Work on copies so the raw frames stay untouched during cleaning
df_sales = train.copy()
df_store = store.copy()
# Closed stores with zero sales: build the mask once, reuse it
closed_mask = (df_sales.Open == 0) & (df_sales.Sales == 0)
df_sales[closed_mask].head()
len(df_sales[closed_mask])
There are 172,817 observations of closed stores in the data — about 10% of the total number of observations.
# Stores recorded as open but with zero sales — candidates for removal
open_zero_mask = (df_sales.Open == 1) & (df_sales.Sales == 0)
df_sales[open_zero_mask].head()
len(df_sales[open_zero_mask])
There are 54 observations where stores were open but recorded no sales. To avoid biased forecasts we will drop these rows.
# Drop rows where the store was closed or made no sales
df_sales = df_sales[(df_sales["Open"] != 0) & (df_sales['Sales'] != 0)]
df_sales.head()
df_sales.shape
df_store.head()
df_store.info()
# Null value counts per column
df_store.isnull().sum()
# Missing values in CompetitionDistance
df_store[pd.isnull(df_store.CompetitionDistance)]
# Fill NaN with the median value (the distribution is skewed, so the median
# is more robust than the mean).
# FIX: plain assignment instead of inplace=True — chained inplace fillna on a
# column is deprecated in pandas 2.x and will stop working under
# copy-on-write semantics.
df_store['CompetitionDistance'] = df_store['CompetitionDistance'].fillna(
    df_store['CompetitionDistance'].median())
# Fill the remaining NA (promo/competition metadata) with 0
df_store = df_store.fillna(0)
# Join sales and store data on the Store id
df_clean = pd.merge(df_sales, df_store, how='inner', on='Store')
df_clean.head()
# Summary statistics of sales per store type
df_clean['Sales'].groupby(df_clean['StoreType']).describe()
StoreType B has the highest average sales among all store types; however, we have much less data for it.
# Total sales and customers per store type.
# FIX: selecting multiple columns with a bare tuple (['Customers', 'Sales']
# without brackets) was deprecated and then removed in pandas — pass a list.
df_clean.groupby('StoreType')[['Customers', 'Sales']].sum()
StoreType A comes first and StoreType D takes second place in both total Sales and Customers.
# Sales trends by month, faceted by store type (columns) and promo (rows).
# FIX: sns.factorplot was renamed to sns.catplot in seaborn 0.9 and later
# removed; factorplot's default kind was 'point', so pass it explicitly to
# keep the same plot type.
sns.catplot(data=df_clean, x='Month', y="Sales",
            col='StoreType',
            palette='plasma',
            hue='StoreType',
            row='Promo',
            kind='point')
# Customer trends, same faceting
sns.catplot(data=df_clean, x='Month', y="Customers",
            col='StoreType',
            palette='plasma',
            hue='StoreType',
            row='Promo',
            kind='point')
# Sale-per-customer trends, same faceting
sns.catplot(data=df_clean, x='Month', y="SalePerCustomer",
            col='StoreType',
            palette='plasma',
            hue='StoreType',
            row='Promo',
            kind='point')
The plots above suggested StoreType B sells the most, but that is misleading. The highest SalePerCustomer amount is observed at StoreType D — about 12€ with Promo and 10€ without. For StoreType A and C it is about 9€.
We chose a single store, Store 2 (StoreType A), to demonstrate the analysis. It also makes sense to downsample the data from days to weeks using the resample method to see the trends more clearly.
# Choose store 2 and keep only the date and sales columns
store_2 = df_sales[df_sales['Store'] == 2][['Sales', 'Date']]
# Interactive daily-sales line chart with a range slider/selector
fig = px.line(store_2, x='Date', y='Sales')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1y', step='year', stepmode='backward'),
            dict(count=2, label='2y', step='year', stepmode='backward'),
            dict(count=3, label='3y', step='year', stepmode='backward'),
            dict(step='all')
        ])))
fig.show()
# FIX: `store_2['Date'].sort_index(ascending=False, inplace=True)` sorted a
# column copy by its index — a no-op that only triggered
# SettingWithCopyWarning. Sort the frame chronologically instead.
store_2 = store_2.sort_values('Date')
# Downsample from days to weekly means to expose the trend
store_2 = store_2.set_index('Date').resample('W').mean()
store_2.head()
plt.figure(figsize=(12, 5))
sns.lineplot(x=store_2.index, y=store_2['Sales'])
# Decompose the weekly series into observed / trend / seasonal / residual
decomposition = seasonal_decompose(store_2['Sales'], model='multiplicative')
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize=(12, 10))
components = [
    (decomposition.observed, 'OBSERVED', ax1),
    (decomposition.trend, 'TREND', ax2),
    (decomposition.seasonal, 'SEASONAL', ax3),
    (decomposition.resid, 'RESIDUAL', ax4),
]
for series, label, axis in components:
    series.plot(ax=axis)
    axis.set_ylabel(label)
f.subplots_adjust(hspace=0.5)
There is a downward trend with a clear seasonal effect.
# Plot the ACF and PACF side by side to inspect autocorrelation structure
fig = plt.figure(constrained_layout=True, figsize=(15, 4))
grid = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
ax1 = fig.add_subplot(grid[0, 0])
ax2 = fig.add_subplot(grid[0, 1])
plot_acf(store_2['Sales'], lags=20, ax=ax1)
plot_pacf(store_2['Sales'], lags=20, ax=ax2)
plt.show()
# Double-check stationarity using the augmented Dickey-Fuller test.
from statsmodels.tsa.stattools import adfuller

def adf_test(series, title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    # autolag='AIC' lets adfuller pick the lag length that minimises AIC
    result = adfuller(series.dropna(), autolag='AIC')
    labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
    report = pd.Series(result[:4], index=labels)
    for key, val in result[4].items():
        report[f'critical value ({key})'] = val
    print(report.to_string())
    # Interpret the p-value at the conventional 5% significance level
    if result[1] <= 0.05:
        verdict = ("Strong evidence against the null hypothesis",
                   "Reject the null hypothesis",
                   "Data is stationary")
    else:
        verdict = ("Weak evidence against the null hypothesis",
                   "Fail to reject the null hypothesis",
                   "Data is non-stationary")
    for line in verdict:
        print(line)

adf_test(store_2['Sales'])
The series is stationary enough to fit time series models.
# Chronological hold-out split: first 120 weeks train, the rest test
train = store_2.iloc[:120]
test = store_2.iloc[120:]
train.head()
# auto_arima helps choose the optimal model; sometimes manually tweaking the
# hyperparameters yields a better result.
# FIX: the keyword is `suppress_warnings` (plural) — the misspelled
# `suppress_warning` is not a real parameter and was silently ignored.
model = auto_arima(train['Sales'], m=7, seasonal=False,
                   start_p=0, start_q=0, max_order=5, test='adf',
                   error_action='ignore', suppress_warnings=True,
                   stepwise=True, trace=True)
model.summary()
# ARMA(2, 0): fit on the training weeks and forecast the test range.
# FIX: statsmodels removed tsa.arima_model.ARMA in 0.13; ARMA(p, q) is
# exactly ARIMA(order=(p, 0, q)) from the replacement module.
from statsmodels.tsa.arima.model import ARIMA as ArimaModel
model1 = ArimaModel(train['Sales'], order=(2, 0, 0))
results = model1.fit()
# Predict over the positional index range covered by the test set
start = len(train)
end = len(train) + len(test) - 1
predictions1 = results.predict(start=start, end=end)
predictions1.index = test.index
test['Sales'].plot()
predictions1.plot(label='prediction')
# Consistency fix: the SARIMAX cell shows a legend; do the same here
plt.legend()
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(test['Sales'], predictions1)))
# Re-run auto_arima allowing a weekly (m=7) seasonal component.
# FIX: keyword is `suppress_warnings` (plural), as in the non-seasonal run.
model = auto_arima(train['Sales'], m=7, seasonal=True,
                   start_p=0, start_q=0, max_order=5, test='adf',
                   error_action='ignore', suppress_warnings=True,
                   stepwise=True, trace=True)
model.summary()
# SARIMA(2,0,0)(0,0,1,7): AR(2) plus a weekly seasonal MA term
model = SARIMAX(train['Sales'], order=(2, 0, 0), seasonal_order=(0, 0, 1, 7))
results = model.fit()
start = len(train)
end = len(train) + len(test) - 1
# dynamic=False: one-step-ahead predictions using actual lagged values
predictions1 = results.predict(start=start, end=end, dynamic=False)
predictions1.index = test.index
# FIX: label the observed series too, otherwise plt.legend() only shows
# the 'prediction' entry.
test['Sales'].plot(label='observed')
predictions1.plot(label='prediction')
plt.legend()
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(test['Sales'], predictions1)))
# Prophet requires columns named 'ds' (datestamp) and 'y' (value)
df_prophet = store_2.reset_index()[['Date','Sales']].rename({'Date':'ds','Sales':'y'}, axis='columns')
df_prophet.head()
df_prophet.tail(20)
# Same chronological split as the ARIMA experiments: 120 weeks train, rest test
train_prophet = df_prophet[:120]
test_prophet = df_prophet[120:]
test_prophet.head()
# 95% uncertainty interval; yearly + weekly seasonalities enabled explicitly
model = Prophet(interval_width=0.95,yearly_seasonality=True,weekly_seasonality=True)
model.fit(train_prophet)
# Extend the frame 15 weekly periods beyond the training data
future = model.make_future_dataframe(periods=15,freq='W')
future.tail()
forecast = model.predict(future)
# yhat is the point forecast; yhat_lower/upper bound the interval
forecast[['ds','yhat','yhat_lower','yhat_upper']].tail(20)
# Overlay actuals and forecast on one axis for a quick visual check
pd.concat([df_prophet.set_index('ds')['y'],forecast.set_index('ds')['yhat']],axis=1).plot()
model.plot(forecast);
model.plot_components(forecast);
# Mark the trend changepoints Prophet detected on the forecast plot
from fbprophet.plot import add_changepoints_to_plot
fig = model.plot(forecast)
a = add_changepoints_to_plot(fig.gca(),model,forecast)
# Rebuild store 2's daily series for the LSTM experiment
df = df_sales[df_sales['Store'] == 2][['Sales', 'Date']]
# FIX: `df['Date'].sort_index(ascending=False, inplace=True)` sorted a column
# copy by its index — a no-op with a SettingWithCopyWarning. Sort the frame
# chronologically instead.
df = df.sort_values('Date')
from sklearn.preprocessing import MinMaxScaler
# Scale sales into [0, 1] — neural nets train poorly on raw magnitudes
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(df[['Sales']])
# 75/25 chronological split.
# FIX: renamed `train` -> `train_size`; the original rebound the earlier
# `train` DataFrame to an int, shadowing it.
train_size = int(len(data) * 0.75)
train_data, test_data = data[:train_size], data[train_size:]
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """Slice a (n, 1) array into supervised-learning pairs.

    Each X row holds `look_back` consecutive values from column 0; the
    matching y is the value immediately after that window.

    FIX: the original loop bound was len(dataset) - look_back - 1, which
    silently dropped the last usable window/target pair; the correct
    bound is len(dataset) - look_back.
    """
    data_x, data_y = [], []
    for i in range(len(dataset) - look_back):
        data_x.append(dataset[i:(i + look_back), 0])
        data_y.append(dataset[i + look_back, 0])
    return np.array(data_x), np.array(data_y)
# Build X = window ending at t and y = value at t+1
look_back = 1
train_X, train_y = create_dataset(train_data, look_back)
test_X, test_y = create_dataset(test_data, look_back)
train_X.shape, train_y.shape
# Insert a middle axis: the LSTM expects input shaped
# [samples, time steps, features] (equivalent to reshape(n, 1, look_back))
train_X = train_X[:, np.newaxis, :]
test_X = test_X[:, np.newaxis, :]
### Create the LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
# Two stacked LSTM layers (50 units each) feeding a single-unit regressor;
# input_shape=(1, 1) matches the [1 time step, 1 feature] windows built above.
model=Sequential()
model.add(LSTM(50,return_sequences=True,input_shape=(1,1)))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error',optimizer='adam')
# NOTE(review): batch_size=1 over 100 epochs is very slow to train —
# consider a larger batch size.
model.fit(train_X,train_y,validation_data=(test_X,test_y),epochs=100,batch_size=1,verbose=1)
# Model prediction on the train and test windows
train_predict = model.predict(train_X)
test_predict = model.predict(test_X)
# Transform the predictions back to the original sales scale
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
## Calculate RMSE performance metrics
from sklearn.metrics import mean_squared_error
import math
# FIX: the targets were still min-max scaled while the predictions had been
# inverse-transformed, so the original RMSE mixed two different scales.
# Inverse-transform the targets too before comparing.
train_y_orig = scaler.inverse_transform(train_y.reshape(-1, 1))
test_y_orig = scaler.inverse_transform(test_y.reshape(-1, 1))
print('Train RMSE: ', math.sqrt(mean_squared_error(train_y_orig, train_predict)))
print('Test RMSE: ', math.sqrt(mean_squared_error(test_y_orig, test_predict)))